In [1]:
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.linalg._
In [2]:
println(sc.version)
In [3]:
def loadCsv(path: String): RDD[LabeledPoint] = {
  import scala.util.Try
  sc.textFile(path).flatMap { line =>
    // Try + flatMap silently drops rows that fail to parse (e.g. a header line).
    Try {
      val row = line.split(",").map { _.toInt }
      LabeledPoint(
        label = row.head,
        features = new DenseVector(row.tail.map { _.toDouble })
      )
    }.toOption
  }
}
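A quick sanity check can confirm that parsing works before splitting the data. This is only a sketch; it assumes the same mnist_train.csv path used in the next cell, and that each row is a label followed by 784 pixel values.
// Sketch: inspect one parsed record; MNIST rows should give 784 features.
val sample = loadCsv("mnist_train.csv").first()
println(s"label = ${sample.label}, numFeatures = ${sample.features.size}")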
In [4]:
val Array(train, test) = loadCsv("mnist_train.csv").randomSplit(Array(0.8, 0.2), seed=333L)
/** We will use the training and test datasets more than once.
  * To avoid recomputation they should be persisted (cached).
  */
train.persist()
test.persist()
Out[4]:
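Note that persist() only marks the RDDs for caching; nothing is computed until an action runs. A small sketch to materialize the cache and check the split sizes:
// Sketch: an action such as count() materializes the cached splits.
println(s"train size = ${train.count()}, test size = ${test.count()}")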
In [5]:
/** Default logistic regression. */
val logreg = new LogisticRegressionWithLBFGS().setNumClasses(10)
In [6]:
/** The same could be done with the companion-object `train` helper methods (e.g. `LogisticRegressionWithSGD.train`). */
val trainedModel = logreg.run(train)
In [7]:
/** A structural type `arg: { def f(...): ... }` accepts any class that defines
  * a method with that name and signature, so this helper works for every MLlib
  * model exposing `predict(Vector): Double`.
  */
import scala.language.reflectiveCalls

def testAccuracy(model: { def predict(f: Vector): Double })(data: RDD[LabeledPoint]): Double = {
  // MulticlassMetrics expects (prediction, label) pairs.
  val prediction = data.map { lp =>
    val pred = model.predict(lp.features)
    (pred, lp.label)
  }
  import org.apache.spark.mllib.evaluation._
  val metrics = new MulticlassMetrics(prediction)
  val cm = metrics.confusionMatrix
  // Accuracy = sum of the confusion-matrix diagonal over the total count.
  val correct = (0 until cm.numCols).map { i => cm(i, i) }.sum
  val total = cm.toArray.sum
  correct / total
}
In [8]:
println {
  testAccuracy(trainedModel)(test)
}
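Beyond overall accuracy, per-class precision and recall are often informative. The sketch below reuses the same (prediction, label) pairs with MulticlassMetrics; the formatting is illustrative.
// Sketch: per-class precision and recall for the multiclass model.
val predAndLabels = test.map { lp => (trainedModel.predict(lp.features), lp.label) }
val mm = new org.apache.spark.mllib.evaluation.MulticlassMetrics(predAndLabels)
mm.labels.foreach { l =>
  println(f"class ${l}%.0f: precision = ${mm.precision(l)}%.3f, recall = ${mm.recall(l)}%.3f")
}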
In [9]:
/** The same logistic regression, now with regularization. */
val logreg = new LogisticRegressionWithLBFGS().setNumClasses(10)
logreg.optimizer.setRegParam(0.1)
val trainedModel = logreg.run(train)
In [10]:
println {
  testAccuracy(trainedModel)(test)
}
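To see how the regularization strength affects accuracy, one could sweep a few regParam values. This is a sketch; the candidate values are illustrative, not tuned.
// Sketch: compare a few regularization strengths on the same split.
for (reg <- Seq(0.01, 0.1, 1.0)) {
  val lr = new LogisticRegressionWithLBFGS().setNumClasses(10)
  lr.optimizer.setRegParam(reg)
  val m = lr.run(train)
  println(s"regParam = $reg, accuracy = ${testAccuracy(m)(test)}")
}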
In [11]:
def rocauc(model: { def predict(f: Vector): Double })(data: RDD[LabeledPoint]): Double = {
  // BinaryClassificationMetrics expects (score, label) pairs.
  val prediction = data.map { lp =>
    val pred = model.predict(lp.features)
    (pred, lp.label)
  }
  import org.apache.spark.mllib.evaluation._
  val metrics = new BinaryClassificationMetrics(prediction)
  metrics.areaUnderROC()
}
In [12]:
/** Binary task: digit 0 versus all other digits. */
val digit0_train = train.map { lp => lp.copy(label = if (lp.label == 0) 1.0 else 0.0) }
val digit0_test = test.map { lp => lp.copy(label = if (lp.label == 0) 1.0 else 0.0) }
digit0_train.persist()
digit0_test.persist()
val svm = SVMWithSGD.train(digit0_train, numIterations = 250, stepSize = 0.1,
  regParam = 0.1, miniBatchFraction = 0.1)
In [13]:
rocauc(svm)(digit0_test)
Out[13]:
In [14]:
testAccuracy(svm)(digit0_test)
Out[14]:
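The ROC AUC above is computed from hard 0/1 predictions. SVMModel can instead return the raw margin if its decision threshold is cleared, which gives BinaryClassificationMetrics a real score to rank by. A sketch, restoring the default threshold of 0.0 afterwards:
// Sketch: clear the threshold so predict() returns raw margins instead of 0/1.
svm.clearThreshold()
println(rocauc(svm)(digit0_test))
svm.setThreshold(0.0)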
In [15]:
import org.apache.spark.mllib.tree.RandomForest
In [16]:
val rf = RandomForest.trainClassifier(input = train, numClasses = 10,
  categoricalFeaturesInfo = Map.empty[Int, Int],
  numTrees = 5,
  featureSubsetStrategy = "log2", impurity = "gini", maxDepth = 25, maxBins = 100, seed = 333)
In [17]:
testAccuracy(rf)(test)
Out[17]:
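The trained ensemble can also be inspected for size, which helps gauge whether maxDepth = 25 produces very large trees. A sketch:
// Sketch: report the number of trees and total nodes in the forest.
println(s"trees = ${rf.numTrees}, total nodes = ${rf.totalNumNodes}")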
In [18]:
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.configuration._
In [19]:
val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.numIterations = 2
boostingStrategy.treeStrategy.numClasses = 2
boostingStrategy.treeStrategy.maxDepth = 25
boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map.empty[Int, Int]
println {
boostingStrategy
}
In [20]:
val gbt = GradientBoostedTrees.train(digit0_train, boostingStrategy)
In [21]:
testAccuracy(gbt)(digit0_test)
Out[21]:
In [22]:
val boostingStrategy = BoostingStrategy.defaultParams("Regression")
boostingStrategy.numIterations = 2
boostingStrategy.treeStrategy.maxDepth = 25
boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map.empty[Int, Int]
println {
boostingStrategy
}
In [23]:
val gbt = GradientBoostedTrees.train(train, boostingStrategy)
In [24]:
testAccuracy(gbt)(test)
Out[24]:
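Because the regression GBT predicts a continuous value, scoring it with exact-match accuracy against integer class labels understates its quality. One option, sketched below, is to round predictions to the nearest label before scoring.
// Sketch: round the continuous regression predictions to the nearest integer
// label and compute plain exact-match accuracy.
val roundedAcc = test.map { lp =>
  if (math.round(gbt.predict(lp.features)).toDouble == lp.label) 1.0 else 0.0
}.mean()
println(roundedAcc)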